{
 "cells": [
  {
   "cell_type": "code",
   "id": "initial_id",
   "metadata": {
    "collapsed": true,
    "ExecuteTime": {
     "end_time": "2024-08-26T03:10:15.375753Z",
     "start_time": "2024-08-26T03:10:14.099523Z"
    }
   },
   "source": [
    "%matplotlib inline\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import torch\n",
    "import torchvision\n",
    "from torch import nn\n",
    "from d2l import torch as d2l\n",
    "from torch.utils.data import DataLoader, TensorDataset"
   ],
   "execution_count": 1,
   "outputs": []
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-08-26T03:10:15.394913Z",
     "start_time": "2024-08-26T03:10:15.376754Z"
    }
   },
   "cell_type": "code",
   "source": [
    "train_data = pd.read_csv('./data/train.csv')\n",
    "test_data = pd.read_csv('./data/test.csv')"
   ],
   "id": "ae40a0041b60340b",
   "execution_count": 2,
   "outputs": []
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-08-26T03:10:15.397432Z",
     "start_time": "2024-08-26T03:10:15.395648Z"
    }
   },
   "cell_type": "code",
   "source": [
    "# 训练数据集包括1460个样本，每个样本80个特征和1个标签\n",
    "# 测试数据集包含1459个样本，每个样本80个特征\n",
    "print(train_data.shape)\n",
    "print(test_data.shape)"
   ],
   "id": "2bd1c897dced3c5d",
   "execution_count": 3,
   "outputs": []
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-08-26T03:10:15.407940Z",
     "start_time": "2024-08-26T03:10:15.398635Z"
    }
   },
   "cell_type": "code",
   "source": "train_data.head()",
   "id": "7b35707e34afee4",
   "execution_count": 4,
   "outputs": []
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-08-26T03:10:15.416062Z",
     "start_time": "2024-08-26T03:10:15.408521Z"
    }
   },
   "cell_type": "code",
   "source": [
    "# 在每个样本中，第一个特征是ID，删除\n",
    "all_features = pd.concat((train_data.iloc[:, 1:-1], test_data.iloc[:, 1:]))"
   ],
   "id": "9127005a3742571",
   "execution_count": 5,
   "outputs": []
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-08-26T03:10:15.425364Z",
     "start_time": "2024-08-26T03:10:15.417003Z"
    }
   },
   "cell_type": "code",
   "source": "all_features.head()",
   "id": "9571226706d05a1c",
   "execution_count": 6,
   "outputs": []
  },
  {
   "metadata": {},
   "cell_type": "markdown",
   "source": "# 1. 数据预处理",
   "id": "3e5158f9f0a4a90c"
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-08-26T03:10:15.442673Z",
     "start_time": "2024-08-26T03:10:15.425901Z"
    }
   },
   "cell_type": "code",
   "source": [
    "# 1. 数据预处理\n",
    "# 若无法获得测试数据，则可根据训练数据计算均值和标准差(缩放到零均值和单位方差 miu=0, sigma=1)\n",
    "numeric_features = all_features.dtypes[all_features.dtypes != 'object'].index  # 数值类型特征\n",
    "all_features[numeric_features] = all_features[numeric_features].apply(lambda x: (x - x.mean()) / (x.std()))\n",
    "# 在标准化数据之后，所有均值消失，因此我们可以将缺失值设置为0\n",
    "all_features[numeric_features] = all_features[numeric_features].fillna(0)"
   ],
   "id": "2a21d920c95fb2f2",
   "execution_count": 7,
   "outputs": []
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-08-26T03:10:15.460752Z",
     "start_time": "2024-08-26T03:10:15.443367Z"
    }
   },
   "cell_type": "code",
   "source": [
    "# \"Dummy_na=True\" 将 \"NA\" (缺失值) 视为有效的特征值，并为其创建指示符特征，对于离散值，采用one-hot编码\n",
    "all_features = pd.get_dummies(all_features, dummy_na=True)\n",
    "all_features.shape  # 此转换会将特征的总数量从79个增加到331个"
   ],
   "id": "7d10570f41d9e6af",
   "execution_count": 8,
   "outputs": []
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-08-26T03:10:15.466901Z",
     "start_time": "2024-08-26T03:10:15.461505Z"
    }
   },
   "cell_type": "code",
   "source": [
    "# 从pandas格式中提取NumPy格式，并将其转换为张量tensor表示用于训练\n",
    "n_train = train_data.shape[0]\n",
    "train_features = torch.tensor(all_features[:n_train].values, dtype=torch.float32)\n",
    "test_features = torch.tensor(all_features[n_train:].values, dtype=torch.float32)\n",
    "train_labels = torch.tensor(train_data.SalePrice.values.reshape(-1, 1), dtype=torch.float32)"
   ],
   "id": "a87fb0ec1853ee97",
   "execution_count": 9,
   "outputs": []
  },
  {
   "metadata": {},
   "cell_type": "markdown",
   "source": "# 2. 开始训练",
   "id": "59ba01ccbc7d1c9d"
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-08-26T03:10:15.470360Z",
     "start_time": "2024-08-26T03:10:15.468769Z"
    }
   },
   "cell_type": "code",
   "source": [
    "# 2. 开始训练\n",
    "loss_fn = nn.MSELoss()\n",
    "in_features = train_features.shape[1]"
   ],
   "id": "156424991497e89b",
   "execution_count": 10,
   "outputs": []
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-08-26T03:10:15.473237Z",
     "start_time": "2024-08-26T03:10:15.470880Z"
    }
   },
   "cell_type": "code",
   "source": [
    "# 采用单层线性回归\n",
    "net = nn.Sequential(nn.Linear(in_features=in_features, out_features=1))"
   ],
   "id": "ce8b92b95273cf3a",
   "execution_count": 11,
   "outputs": []
  },
  {
   "metadata": {},
   "cell_type": "markdown",
   "source": [
    "更关心相对误差 (y - y_hat)/y 而不是绝对误差 (y - y_hat)\n",
    "\n",
    "修改损失函数为 (log(yi) - log(yi_hat))**2 求和后除以 n 然后开根号"
   ],
   "id": "cf06c5343daa6590"
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-08-26T03:10:15.475682Z",
     "start_time": "2024-08-26T03:10:15.473816Z"
    }
   },
   "cell_type": "code",
   "source": [
    "def log_rmse(net, features, labels):\n",
    "    # 这一步仅仅为了在取对数时进一步稳定该值，将小于1的值设置为1\n",
    "    clipped_preds = torch.clamp(net(features), 1, float('inf'))\n",
    "    rmse = torch.sqrt(loss_fn(torch.log(clipped_preds), torch.log(labels)))\n",
    "    return rmse.item()"
   ],
   "id": "47ece44c4e3b93c7",
   "execution_count": 12,
   "outputs": []
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-08-26T03:10:15.478878Z",
     "start_time": "2024-08-26T03:10:15.476232Z"
    }
   },
   "cell_type": "code",
   "source": [
    "# 训练函数\n",
    "def train(net, train_features, train_labels, test_features, test_labels, num_epochs, learning_rate, weight_decay,\n",
    "          batch_size):\n",
    "    train_ls, test_ls = [], []\n",
    "\n",
    "    # 优化器Adam\n",
    "    learning_rate = learning_rate\n",
    "    optimizer = torch.optim.Adam(\n",
    "        params=net.parameters(),\n",
    "        lr=learning_rate,\n",
    "        weight_decay=weight_decay,\n",
    "    )\n",
    "\n",
    "    # 训练数据转换为DataLoader\n",
    "    train_datasets = TensorDataset(train_features, train_labels)\n",
    "    train_loaders = DataLoader(train_datasets, batch_size=batch_size, shuffle=True)\n",
    "\n",
    "    # 这里使用的是Adam优化算法\n",
    "    for epoch in range(num_epochs):\n",
    "        for X, y in train_loaders:\n",
    "            l = loss_fn(net(X), y)\n",
    "            optimizer.zero_grad()\n",
    "            l.backward()\n",
    "            optimizer.step()\n",
    "        train_ls.append(log_rmse(net, train_features, train_labels))\n",
    "\n",
    "        if test_labels is not None:\n",
    "            test_ls.append(log_rmse(net, test_features, test_labels))\n",
    "\n",
    "    return train_ls, test_ls\n"
   ],
   "id": "62ca73d24f54e94e",
   "execution_count": 13,
   "outputs": []
  },
  {
   "metadata": {},
   "cell_type": "markdown",
   "source": "# 3. K-折交叉验证",
   "id": "9f3bc97c317afa92"
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-08-26T03:10:15.481731Z",
     "start_time": "2024-08-26T03:10:15.479396Z"
    }
   },
   "cell_type": "code",
   "source": [
    "def get_k_fold_data(k, i, X, y):\n",
    "    assert k > 1\n",
    "    fold_size = X.shape[0] // k\n",
    "    X_train, y_train = None, None\n",
    "    for j in range(k):\n",
    "        idx = slice(j * fold_size, (j + 1) * fold_size)\n",
    "        X_part, y_part = X[idx, :], y[idx]\n",
    "        if j == i:\n",
    "            X_valid, y_valid = X_part, y_part\n",
    "        elif X_train is None:\n",
    "            X_train, y_train = X_part, y_part\n",
    "        else:\n",
    "            X_train = torch.cat([X_train, X_part], 0)\n",
    "            y_train = torch.cat([y_train, y_part], 0)\n",
    "    return X_train, y_train, X_valid, y_valid"
   ],
   "id": "2aab021fec12ee80",
   "execution_count": 14,
   "outputs": []
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-08-26T03:10:28.763690Z",
     "start_time": "2024-08-26T03:10:28.756618Z"
    }
   },
   "cell_type": "code",
   "source": [
    "# 当我们在K-折交叉验证中训练K次后，返回训练和验证误差的平均值\n",
    "def k_fold(k, X_train, y_train, num_epochs, learning_rate, weight_decay, batch_size):\n",
    "    train_l_sum, valid_l_sum = 0, 0\n",
    "    for i in range(k):\n",
    "        data = get_k_fold_data(k, i, X_train, y_train)\n",
    "        train_ls, valid_ls = train(net, *data, num_epochs, learning_rate, weight_decay, batch_size)\n",
    "        train_l_sum += train_ls[-1]\n",
    "        valid_l_sum += valid_ls[-1]\n",
    "        if i == 0:\n",
    "            d2l.plot(list(range(1, num_epochs + 1)), [train_ls, valid_ls],\n",
    "                     xlabel='epoch', ylabel='rmse', xlim=[1, num_epochs],\n",
    "                     legend=['train', 'valid'], yscale='log')\n",
    "        print('折{}，训练log rmse = {:.6f}, 验证log rmse = {:.6f}'.format(i + 1, float(train_ls[-1]), float(valid_ls[-1])))\n",
    "\n",
    "    return train_l_sum / k, valid_l_sum / k"
   ],
   "id": "6baa6d265901f30c",
   "execution_count": 15,
   "outputs": []
  },
  {
   "metadata": {},
   "cell_type": "markdown",
   "source": [
    "# 4. 模型选择\n",
    "\n",
    "(即利用K-折交叉验证，选择最优的超参数，然后在对所有训练集进行训练)"
   ],
   "id": "d680ebc63fb4a074"
  },
  {
   "metadata": {
    "jupyter": {
     "is_executing": true
    }
   },
   "cell_type": "code",
   "source": [
    "k, num_epochs, lr, weight_decay, batch_size = 5, 100, 5, 0, 64\n",
    "train_l, valid_l = k_fold(k, train_features, train_labels, num_epochs, lr, weight_decay, batch_size)\n",
    "print('{}-折验证: 平均训练log rmse = {:.6f}, 平均验证log rmse = {:.6f}'.format(k, float(train_l), float(valid_l)))"
   ],
   "id": "a0d2063ac38dd1e5",
   "execution_count": null,
   "outputs": []
  },
  {
   "metadata": {},
   "cell_type": "code",
   "execution_count": null,
   "source": "",
   "id": "89173f690395cafd",
   "outputs": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
